###---###---###---###---###---###---###---###---###---###---
# Kerice Doten-Snitker, Steve Pfaff, & Yuan Hsiao
# 
# Data management and prep for analyses
#    1) source scripts from Leeson & Russ EJ (2019)
#    2) build city panel, on Becker, Hsiao, Pfaff, & Rubin foundation
#    3) create diffusion measures
#    4) run analyses via compiling Rmd
# 
# R version 4.1.3 (2022-03-10) -- "One Push-Up"
# Platform: x86_64-apple-darwin17.0 (64-bit)
###---###---###---###---###---###---###---###---###---###---

###---###---###---###---###---###---###---###---###---###---###
### Set up the working environment ----------------------------
###---###---###---###---###---###---###---###---###---###---###

# add packages with the pacman library, which installs if not installed
library(pacman)
# use the here package for improving replicability
p_load(here)
# for data manipulation
p_load(magrittr, plyr, dplyr, cubelyr, tidyverse, haven, zoo, reshape2, abind)
# for geographical analyses and calculations
p_load(geosphere, raster, gdistance, rgdal, rgeos, spatstat, ncdf4)
# for various model types and simulations
p_load(rstan, brms, splines2, tidybayes, posterior) # for Bayesian regression
p_load(ggeffects, sjstats)
# for exploration and visuals
p_load(skimr, corrplot, cowplot, bayesplot, sjPlot)
# for output
p_load(tinytex, pander, knitr, stargazer, texreg, gridExtra, kableExtra)

set.seed(1487)
options(scipen=999) # don't use scientific notation

###---###---###---###---###---###---###---###---###---###---
# 1. add Leeson & Russ data to the R environment ####
###---###---###---###---###---###---###---###---###---###---

trials <- read_csv(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "data", "trials.csv"))
# lat lon coordinates are WGS84/EPSG:4236
# "When record-supplied location information permits, we also assign persons tried (and trial-related deaths) to contemporary cities (see, column ‘city’) or counties (see, column ‘county’), the latter reflecting level-two administrative regions in the Global Administrate Areas Database (GADM, 2012). For these observations, our data set provides latitude and longitude coordinates (see, columns ‘lat’ and ‘lon’); in the case of counties, those of the county centroid." (Appendix E)
# 5803 (~half) with no lat lon - city unknown; county centroids added in script 8-create-grid-panel

# See note below - L&R use "Nuremberg" in all but one obs
trials$city[trials$city=="Nurnberg"] <- "Nuremberg"
# any other geo name errors?
trials$city[trials$city=="kotz"] <- "Kotz"
places <- trials %>%
  dplyr::select(gadm.adm1, gadm.adm2, city, lon, lat) %>%
  distinct(gadm.adm1, gadm.adm2, city, .keep_all=TRUE) %>%
  arrange(city, gadm.adm1) %>% filter(!is.na(city))
which(duplicated(places$city))
dupnames <- places$city[c(43, 149, 159, 337, 361, 409, 420, 456, 525, 602, 681, 781, 786, 866, 880)]
places[places$city %in% dupnames,] %>% print(n = Inf) # print them all!
# fix names; don't worry about coords (will fix later)
trials$gadm.adm2[trials$city=="Augsburg"] <- "Schwaben"
trials$gadm.adm1[trials$city=="Bruges"] <- "Vlaanderen"
trials$gadm.adm2[trials$city=="Burgau"] <- "Schwaben"
trials$gadm.adm2[trials$city=="Heidelberg"] <- "Karlsruhe"
trials$gadm.adm2[trials$city=="Isny im Allgau"] <- "Tubingen"
trials$gadm.adm2[trials$city=="Kempten"] <- "Schwaben"
trials$gadm.adm2[trials$city=="Landshut"] <- "Niederbayern"
trials$gadm.adm2[trials$city=="Memmingen"] <- "Schwaben"
trials$gadm.adm2[trials$city=="Nordlingen"] <- "Schwaben"
trials$gadm.adm2[trials$city=="Regensburg"] <- "Oberpfalz"
trials$gadm.adm2[trials$city=="Stein am Rhein"] <- "Schaffhausen"
trials$gadm.adm2[trials$city=="Straubing"] <- "Niederbayern"
trials$gadm.adm2[trials$city=="Wallerstein"] <- "Schwaben"
trials$gadm.adm2[trials$city=="Weissenhorn"] <- "Schwaben"

# what about multiple observations in the same year?
# this appears to be two separate trials of 1 and 2 persons
trials %>% filter(trials$city=="Strassburg"&trials$year==1353) %>% dplyr::select(year, tried, deaths, city, record.source)
# this appears to be the same 3 people, recorded in Carlson and in Kieckhefer
trials %>% filter(trials$city=="Agen"&trials$year==1326) %>% dplyr::select(year, tried, deaths, city, record.source)
# Carlson and Kieckhefer note the same underlying sources (JM Vidal and Cohn) - this is the same trial
# And then there is the question of two sources reporting on the same trial but with different pieces of information
trials %>% filter(trials$city=="Ravensburg"&trials$year==1484) %>% dplyr::select(year, tried, deaths, city, record.source)
# Kieckhefer says 8 women tried, 2 killed, 6 released, and Midelfort shows 6 women released - are these the same event?
nrow(trials) # At this point there are 10940 observations

# Make a table that says how many entries there are in a year for a city (ignoring, at least for now, that there might be some duplicates at the county or region level) and how many sources there are.
# First, tally up the annual totals...
entries <- trials %>% 
  dplyr::select(gadm.adm1, gadm.adm2, city, year, record.source) %>% 
  arrange(city, year) %>% filter(!is.na(city)) %>%
  group_by(gadm.adm1, gadm.adm2, city, year) %>% 
  summarize(obs_ann = n(),
            sources = length(unique(record.source)))
length(which(entries$obs_ann>1)) # 609 obs with >1 entry in a year
# ...Then merge back with the main data to be able to filter for multiple annual entries as well as see what those entries are.
multipleobs <- entries %>% ungroup() %>%
  right_join(trials, by = c("gadm.adm1", "gadm.adm2", "city", "year")) %>%
  # filter to multiple rows per year for a city
  filter(obs_ann>=2&!is.na(year)) %>% arrange(gadm.adm1, gadm.adm2, city, year) %>%
  dplyr::select(city, year, tried, deaths, record.source, obs_ann, sources, decade)
# It appears that some sources list multiple trials, with potentially varying numbers of persons tried, but there are also rows that seem to be the same trial from different sources. The highest potential for problems is with entries where there is more than one source for a trial in a year.
multipleobs %>% filter(sources>1) %>% print(n = Inf) # print all rows - 140

# Check the sources for these trials and see if they have the same underlying sources or reference each other. Then remove any duplicates by subsetting the trials data where conditions (that identify duplicates) are not (!) met:
trials <- subset(trials,!( 
# -- Multiple sources, same outcomes: Agen 1326 (K/C)*D, Ettenheim 1450 (K/Mi)*D, Konstanz 1432 (K/Mi)*D, Straubing 1435 (Bh/K)*S, Regensburg 1453 (Bh/K)*D, Augsburg 1469 (Bh/K) [same twice removed to Augsburg Chronicle Cgm 2026]*D, Burgau 1580 (Bh/Mi)*D, Memmingen 1575 (Bh/Mi) [same once removed]*D, Vesoul 1599 (C/Mo)*D, Vesoul 1610 (C/Mo)*N, Paris 1404 (C/K)*D, Bailleul 1659 (C/Mb)*D, Stein am Rhein 1495 (K/Mi)*D, Reimerswaal 1450 (G/K)*S
# clear/outright dupes (e.g., same source) noted with *D
# same info (e.g., named accused) without same sources noted with *S
# Kieckhefer used Midelfort as a reference - drop K where a duplicate of M
# confirmed separate noted with *N
  trials$city=="Agen"&trials$year==1326&trials$record.source=="Carlson (2004)"|
  trials$city=="Ettenheim"&trials$year==1450&trials$record.source=="Kieckhefer (1976)"|
  trials$city=="Konstanz"&trials$year==1432&trials$record.source=="Kieckhefer (1976)"|
  trials$city=="Straubing"&trials$year==1435&trials$record.source=="Behringer (1987)"|
  trials$city=="Regensburg"&trials$year==1453&trials$record.source=="Behringer (1987)"|
  trials$city=="Augsburg"&trials$year==1469&trials$record.source=="Behringer (1987)"|
  trials$city=="Burgau"&trials$year==1580&trials$record.source=="Behringer (1987)"|
  trials$city=="Memmingen"&trials$year==1575&trials$tried==10&trials$record.source=="Behringer (1987)"|
  trials$city=="Vesoul"&trials$year==1599&trials$record.source=="Carlson (2004)"|
  trials$city=="Paris"&trials$year==1404&trials$tried==4&trials$record.source=="Carlson (2004)"|
  trials$city=="Bailleul"&trials$year==1659&trials$record.source=="Carlson (2004)"|
  trials$city=="Stein am Rhein"&trials$year==1495&trials$record.source=="Kieckhefer (1976)"|
# Agreements between Monballyu (2002) and Vanysacker (1988) are a special category of overlap because V was M's student and did a dissertation just on Bruges, and M's list cites himself for some and V for others: Bruges 1468 (Mb 1st), Bruges 1532 (V 1st), Bruges 1596 (both; V gives several more, so prefer these), Bruges 1605 (Mb cites primary only; drop V), Bruges 1612 (Mb cites pre-V; drop dupe V but keep others), Bruges 1619 (Mb is mistakenly attributed to Bruges and should be Bruges area - Brugse Vrije - not city, but easier geolocating to city; drop Mb), Bruges 1634 (4 Mb cite V, 5th is "Brugse Vrije" not Bruges; drop Mb)
  trials$city=="Bruges"&trials$year==1468&trials$record.source=="Vanysacker (1988)"|
  trials$city=="Bruges"&trials$year==1532&trials$record.source=="Monballyu (2002)"|
  trials$city=="Bruges"&trials$year==1596&trials$record.source=="Monballyu (2002)"|
  trials$city=="Bruges"&trials$year==1605&trials$record.source=="Vanysacker (1988)"|
  trials$city=="Bruges"&trials$year==1612&trials$deaths==1&trials$record.source=="Vanysacker (1988)"|
  trials$city=="Bruges"&trials$year==1634&trials$record.source=="Monballyu (2002)"|
# -- Multiple sources, one shows NA for deaths: Konstanz 1493 (K/Mi)*D, Landshut 1417 (Bh/K)*D, Augsburg 1349 (Bh/K)*D, Augsburg 1379 (Bh/K)*D, Kempten 1421 (Bh/K)*D, Nordlingen 1478 (Bh/K) [Bh dead end? to 1937 article], Wallerstein 1593 (Bh/Mi)*D, Weissenhorn 1542 (Bh/Mi)*D, Paris 1404 (C/K)*D, Toulouse 1326 (C/K)*D, Boudry 1573 (C/Mo)*D, Boudry 1575 (C/Mo)*D, Bruges 1468 (K/Mb)*N
  trials$city=="Konstanz"&trials$year==1493&trials$record.source=="Kieckhefer (1976)"|
  trials$city=="Landshut"&trials$year==1417&trials$record.source=="Behringer (1987)"|
  trials$city=="Augsburg"&trials$year==1349&trials$record.source=="Behringer (1987)"|
  trials$city=="Augsburg"&trials$year==1379&trials$record.source=="Behringer (1987)"|
  trials$city=="Kempten"&trials$year==1421&trials$record.source=="Behringer (1987)"|
#trials$city=="Nordlingen"&trials$year==1478&trials$record.source=="Behringer (1987)"|
  trials$city=="Wallerstein"&trials$year==1593&trials$record.source=="Behringer (1987)"|
  trials$city=="Weissenhorn"&trials$year==1542&trials$record.source=="Behringer (1987)"|
  trials$city=="Paris"&trials$year==1404&trials$tried==1&trials$record.source=="Carlson (2004)"|
  trials$city=="Toulouse"&trials$year==1326&trials$record.source=="Carlson (2004)"|
  trials$city=="Boudry"&trials$year==1573&trials$record.source=="Carlson (2004)"|
  trials$city=="Boudry"&trials$year==1575&trials$record.source=="Carlson (2004)"|
# -- Multiple sources, different outcomes:
# Chauny 1485 is definitely the same trial - listed in both Carlson (2004) and Kieckhefer (1976) with the same details, but data entry error for Carlson # tried
# Nice 1437 is definitely the same trial - listed in Carlson (2004) (was it updated since they did data entry?) and Kieckhefer (1976)
# Konstanz 1482 is listed in Midelfort as 48 executed in Constance and Ravensburg 1482-86 (citation Malleus); in Kieckhefer listed as 1482-6 (citation Malleus, tried by H. Institorus) 48 tried in the diocese of Constance, "including 8 mentioned below, 1484, Ravensburg"; Midelfort lists 1484 Ravensburg as 6 tried/0 died, while Kieckhefer says 8 tried/2 died, and both give Müller as source (K includes several others) - edit to: Konstanz 1482 K (remove Mi), Ravensburg 1484 K (remove Mi)
  trials$city=="Chauny"&trials$year==1485&trials$record.source=="Carlson (2004)"|
  trials$city=="Nice"&trials$year==1437&trials$record.source=="Carlson (2004)"))
nrow(trials) # Now there are 10885 rows - removed 55 duplicates

# Add: ID columns
trials <- trials %>%
  # ID for observation
  rowid_to_column(var = "obsID") %>%
  # ID for location
  # grouping in this order makes IDs in alpha order by country and within counties
  group_by(gadm.adm0, gadm.adm1, gadm.adm2, city) %>%
  mutate(siteID = cur_group_id()) %>%
  ungroup() %>%
  # a non-numeric location ID will be useful later for the distance table
  mutate(newID = paste("ID", siteID, sep="_"))

# "We use source-supplied location information to assign confessional battles geographically. Our sources supply a city location for every battle in tab ‘Battles’. Every conflict in our data set is therefore assigned to a contemporary city and country (see, columns ‘city’ and ‘country’). For the former, our data set provides latitude and longitude coordinates (see, columns ‘lat’ and ‘lon’)."
battles <- read.csv(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "data", "battles.csv"), header=TRUE, encoding = "latin1", stringsAsFactors=FALSE)

###---###---###---###---###---###---###---###---###---###---
# 2. data wrangling ####
###---###---###---###---###---###---###---###---###---###---

### Don't run: {
#source(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "code", "6-create-figure-data.R"))
# figure1 <- read.csv(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "data", "clean", "figure1.csv"), header=TRUE, stringsAsFactors=FALSE)
# figure2 <- read.csv(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "data", "clean", "figure2.csv"), header=TRUE, stringsAsFactors=FALSE) }

### Run an updated version of script 7, which was originally written to make a country-decade panel. It pulls in the other datasets/variables for urbanization, wages, and taxes (nothing climate) to country-decade observations
source(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "code", "7-create-panel-data_updated.R")) 
# panel_dataset <- read.csv(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "data", "clean", "panel_dataset.csv"), header=TRUE, stringsAsFactors=FALSE) # theirs
panel_dataset <- read.csv(here::here("Witchtrial_diffusion", "out_data", "panel_dataset.csv"), header=TRUE, stringsAsFactors=FALSE) # Ours, with cleaning/adjustments to names and duplicates - 1176 country-decade obs

### Instead of script 8, which converts point-based data to grid-based, run an updated version to preserve point structure. Keep the code that adds county centroids as lat-lon where city is unknown. L&R lat lon coordinates are WGS84/EPSG:4236, but we want LAEA for more accurate distance calculations.
source(here::here("Witchtrial_diffusion","scripts", "LR_EJ_8-create-grid-panel_updated.R"))
trials_clean <- read.csv(here::here("Witchtrial_diffusion", "out_data", "trials.csv"), header=TRUE, stringsAsFactors=FALSE)
trials_laea <- read.csv(here::here("Witchtrial_diffusion", "out_data", "trials_laea.csv"), header=TRUE, stringsAsFactors=FALSE) #9848 obs
battles_laea <- read.csv(here::here("Witchtrial_diffusion", "out_data", "battles_laea.csv"), header=TRUE, stringsAsFactors=FALSE)
# source(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "code", "8-create-grid-panel.R"))
# panel_dataset_grids <- read.csv(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "data", "clean", "panel_dataset_grids.csv"), header=TRUE, stringsAsFactors=FALSE)
# source(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "code", "9-create-table-data-excel.R"))

# make sure the panel includes all years for each city - at this point, all observations are of at least 1 tried, and no observations where no trials.

###---###---###---###---###---###---###---###---###---###---
# 3. trade network position data (+ BHPR covariates) ####
###---###---###---###---###---###---###---###---###---###---

# Add data from Becker, Hsiao, Pfaff, & Rubin 2020, ongoing
tradenet_all <- read_csv(here::here("Witchtrial_diffusion","BHPR_CentralityUpdated_043022_kdsupdates.csv"), show_col_types = FALSE)
# tradenet <- tradenet_all %>% dplyr::select(bhpr_runningnr, city, currcountry, rubin_coordn, rubin_coorde, coordn_google, coorde_google, kp_latitude, kp_longitude, degree = degreeCentrality031622, betweenness = betweenCentrality031622, eigenvector = eigenCentrality031622) %>%
#   filter(!is.na(degree)) # 22 missing

# Updates made to BHPR:
# Selestat/Schlettstadt had two entries, as did Trieste/Triest and Eltvil/Eltville; we deleted the entries from KP because they had no other covariates.
# KP and Rubin lat/long coords are systematically off because they were entered from Bairoch, where they were in degrees-minutes but were subsequently digitized as degrees in decimals. Use the coords labeled "google". Using Wikipedia (mostly) and Open Street Map, we filled in missing ones. See the cross file for some notes about individual cities.
plot(tradenet_all$kp_longitude, tradenet_all$kp_latitude)
plot(tradenet_all$rubin_coorde, tradenet_all$rubin_coordn)
plot(tradenet_all$coorde_google, tradenet_all$coordn_google)

# export the names and coords, to manually match up with the trials cities
# do a little cleaning to help match where we can ahead of time
# tradenet <- tradenet %>%
#   mutate(city = str_replace_all(city, "_", " ")) %>% #replace all _'s
#   mutate(city = str_to_title(city))
# 
# city_cross <- trials_clean %>%
#   dplyr::select(city, gadm.adm2, gadm.adm1, gadm.adm0, lat, lon) %>%
#   distinct %>%
#   merge(tradenet, by="city", all=TRUE)
# 
# city_cross %>% filter(!is.na(degree)&!is.na(gadm.adm0)) %>% nrow
# # 145 shared? do some manual matching...
# write_csv(city_cross, here::here("Witchtrial_diffusion", "out_data", "city_cross.csv"), na="")

tradenet_all <- tradenet_all %>%
  # BHPR use underscores when a city name is multiple words
  mutate(city = str_replace_all(city, "_", " ")) %>% #replace all _'s
  # Capitalize all words in a name (BHPR is all lowercase)
  mutate(city = str_to_title(city)) %>%
  # Correct capitalization of prepositions
  mutate(city = str_replace_all(city, " Am ", " am ")) %>%
  mutate(city = str_replace_all(city, " An ", " an ")) %>%
  mutate(city = str_replace_all(city, " Im ", " im ")) %>%
  mutate(city = str_replace_all(city, " Der ", " der ")) %>%
  mutate(city = str_replace_all(city, " Unter ", " unter ")) %>%
  mutate(city = str_replace_all(city, " Ob ", " ob "))

# name errors found in matching
tradenet_all$city[tradenet_all$city=="Arnstatdt"] <- "Arnstadt"
tradenet_all$city[tradenet_all$city=="Segeberg"] <- "Segeburg"
tradenet_all$city[tradenet_all$city=="Eltvil"] <- "Eltville"
tradenet_all$city[tradenet_all$city=="Harderwuk"] <- "Harderwijk"
tradenet_all$city[tradenet_all$city=="Iena"] <- "Jena"
tradenet_all$city[tradenet_all$city=="Aschaffenberg"] <- "Aschaffenburg"

# some names should not match
tradenet_all$city[tradenet_all$city=="Reichenberg"] <- "Liberec"
# some names should match
tradenet_all$city[tradenet_all$city=="Baden"] <- "Baden-Baden"
tradenet_all$city[tradenet_all$city=="Brno"] <- "Brunn"
tradenet_all$city[tradenet_all$city=="Eekloo"] <- "Eeklo"
tradenet_all$city[tradenet_all$city=="Kirchheim"] <- "Kirchheim unter Teck" # see notes on this one!
tradenet_all$city[tradenet_all$city=="Luxemburg"] <- "Luxembourg"
tradenet_all$city[tradenet_all$city=="Noerdlingen"] <- "Nordlingen"
tradenet_all$city[tradenet_all$city=="Offenbach"] <- "Offenbach am Main"
tradenet_all$city[tradenet_all$city=="Oostende"] <- "Ostend"
tradenet_all$city[tradenet_all$city=="Schwaebisch Gmund"] <- "Schwabisch Gmund"
tradenet_all$city[tradenet_all$city=="Schwaebisch Hall"] <- "Schwabisch Hall"
tradenet_all$city[tradenet_all$city=="Selestat"] <- "Schlettstadt"
tradenet_all$city[tradenet_all$city=="Strasbourg"] <- "Strassburg"
tradenet_all$city[tradenet_all$city=="Tuebingen"] <- "Tubingen"
tradenet_all$city[tradenet_all$city=="Ueberlingen"] <- "Uberlingen"
tradenet_all$city[tradenet_all$city=="Wuerzburg"] <- "Wurzburg"
# a few names end up a little funny (eg Bourg En Bresse should be Bourg-en-Bresse)

trials_clean %>%
  dplyr::select(city, gadm.adm2, gadm.adm1, gadm.adm0) %>%
  distinct %>%
  merge(tradenet_all, by="city", all.x=FALSE, all.y=TRUE) %>%
  filter(!is.na(degreeCentrality031622)&!is.na(gadm.adm0)) %>%
  nrow
# 151 matching cities, but only 150 have centrality measures

trials_net <- tradenet_all %>%
  # all.y to keep multiple matches from trial records
  merge(trials_clean, by="city", all.x=TRUE, all.y=TRUE) %>% 
  # but filter out cities not in the network by removing obs without id
  filter(!is.na(bhpr_runningnr)) 

# Now consider whether they have the same geographic scope

city_trials_table <- trials_clean %>%
  filter(year>=1400&year<=1650) %>%
  group_by(newID) %>% 
  summarize(tried1400_1650 = sum(tried),
            deaths1400_1650 = sum(deaths))
city_trials_table <- trials_clean %>%
  dplyr::select(newID, city, gadm.adm2, gadm.adm1, gadm.adm0, lat, lon) %>%
  distinct %>%
  merge(city_trials_table, by="newID", all=TRUE) %>%
  dplyr::select(-newID)

tradenet_all <- city_trials_table %>% 
  filter(!is.na(city)) %>%
  dplyr::select(-lat, -lon) %>%
  merge(x=tradenet_all,y=., by="city", all.x = TRUE, all.y = FALSE) %>%
  replace_na(list(tried1400_1650=0, deaths1400_1650=0))

# for descriptive results by city, and to share back to BHPR
write_csv(tradenet_all, here::here("Witchtrial_diffusion", "BHPR_CentralityUpdated_043022_namematched.csv"), na="")

# add text ID, will be helpful for later calculations
# (overwrites newID column from trials data)
# also do some housekeeping
trials_net <- trials_net %>%
  mutate(newID = paste("BHPR", bhpr_runningnr, sep="_")) %>%
  dplyr::select(bhpr_runningnr, newID, city, 
                currcountry, coorde_google, coordn_google,
                obsID, siteID, gadm.adm2, gadm.adm1, gadm.adm0,
                year, decade, century, tried, deaths, record.source,
                prot1530, press, pop1500, logpop1500, indepcity, 
                univ1450, bishop,  laymag, marketpot1500, market_pot_google,
                water, hanseatic, territory,
                electorate, upperrhencirc, lowersaxoncirc, swabiancirc, franccirc,
                westphalcirc, bavariancirc, bohemiancirc, austriancirc, italianhre,
                hredefacto, hredejure, luther_ustc_pre1523, luther_visits_1522,
                degree=degreeCentrality031622, between=betweenCentrality031622, 
                eigen=eigenCentrality031622)

# Convert to Euro equal area projection - already did this for trials and battles, but the cities in the trade network need it.
# This will also filter out any cities where we could not match the lat/long in Google etc.
# Make events into point shapefiles.

trials_net_coords  <- trials_net %>% filter(!is.na(coorde_google)) %>% dplyr::select(coorde_google, coordn_google)
trials_net_spdf <- trials_net %>%
  filter(!is.na(coorde_google)) %>%
  SpatialPointsDataFrame(coords = trials_net_coords, 
                         data   = .,
                         proj4string = CRS("+init=epsg:4326"))
# crs_EPSG3035 <- "+proj=laea +lat_0=52 +lon_0=10 +x_0=4321000 +y_0=3210000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs" # units in meters, not degrees
trials_net_spdf  %<>% spTransform(CRS("+init=epsg:3035"))
trials_net_laea <- trials_net_spdf %>% 
  data.frame() %>% # extract all the data to a non-spatial object
  dplyr::select(-optional) %>% # drop extra column
  rename(lon=coorde_google.1, lat=coordn_google.1) # rename the EPSG3035 columns as lon and lat

write_csv(trials_net_laea, here::here("Witchtrial_diffusion", "out_data", "trials_net_laea.csv"))
# trials_net_laea <- read_csv(here::here("Witchtrial_diffusion", "out_data", "trials_net_laea.csv"))

### Construct the panel. ----
#Some cities had more than one trial in a year, and we did a one-to-many match with the trade network cities, which means duplicate annual observations
panel_dataset_net <- trials_net_laea %>%
  dplyr::select(bhpr_runningnr, year, tried, deaths) %>%
  group_by(bhpr_runningnr, year) %>%
  summarize(tried = sum(tried), deaths=sum(deaths)) %>% ungroup()

# trials_net %>% filter(!is.na(decade)) %>% nrow()
# trials_net %>% filter(!is.na(year)) %>% nrow()
# 1910 with known decade; 1876 with known year (of 2346 total)
# We lose about 500 trials by requiring the year to be known.
  
# Unlike for the trials-location based data, this panel should be balanced.
# That is, each city should be included for each year in the study period.
# Before merging the trials and the other data, pad out the trials 
# with 0 trials for years with no trials.
panel_dataset_net <- trials_net_laea %>%
  dplyr::select(-tried, -deaths, -record.source) %>% # remove year-varying cols
  distinct() %>% # make distinct rows, one for each city-year
  right_join(panel_dataset_net, by=c("bhpr_runningnr", "year")) %>%
  arrange(year) %>%
  # fill in non-included years so that moving sums are correct
  complete(nesting(newID), year = seq(1300, 1700, 1L)) %>%
  group_by(newID) %>%
  fill(bhpr_runningnr, city, currcountry, coordn_google, coorde_google, 
       gadm.adm2, gadm.adm1, gadm.adm0, siteID, lon, lat,
       prot1530, press, pop1500, logpop1500, indepcity, 
       univ1450, bishop,  laymag, marketpot1500, market_pot_google,
       water, hanseatic, territory,
       electorate, upperrhencirc, lowersaxoncirc, swabiancirc, franccirc,
       westphalcirc, bavariancirc, bohemiancirc, austriancirc, italianhre,
       hredefacto, hredejure, luther_ustc_pre1523, luther_visits_1522,
       degree, between, eigen, .direction="downup") %>%
  ungroup() %>%
  # some trials with year NA, which make it through the merge, so filter those
  # and confirm the temporal scope restriction
  filter(!is.na(year)&year<1700) %>%
  # Because of balancing out the city network across years, there are NAs
  # in trials and deaths. Fill 0's!
  replace_na(list(tried=0, deaths=0)) %>%
  # more housekeeping: recode decade/century here
  mutate(decade = as.numeric(str_extract(year, "[0-9]{3}"))*10,
       century = as.numeric(str_extract(year, "[0-9]{2}"))*100)

###---###---###---###---###---###---###---###---###---###---
# 4. Government capacity ####
###---###---###---###---###---###---###---###---###---###---

# Hanse membership - 
# Dollinger, P. (1970) The German Hansa. London: Macmillan.
dollinger <- read_csv(here::here("Witchtrial_diffusion","Dollinger_GermanHansa_clean.csv"), show_col_types = FALSE)

#  Pagel, K. (1983) Die Hanse. Braunschweig, Germany: Westermann.
pagel <- read_csv(here::here("Witchtrial_diffusion","Pagel_DieHanse_clean.csv"), show_col_types = FALSE)

# Free or imperial status - 
# Jacob, M. (2010) “Long-term persistence: The Free and Imperial City Experience.” https://doi.org/10.2139/ssrn.1616973
# gives year
jacob <- read_csv(here::here("Witchtrial_diffusion","Jacob_PersistenceGermanCities_clean.csv"), show_col_types = FALSE)

# The coding from Rubin REStat 2014 is already in the BHPR data 
# It is not time-variant

# Diocesan seats -
# Cheney, D. M. (2017) The Hierarchy of the Catholic Church. Accessed in July-August, 2022, http://catholic-hierarchy.org.
cheney <- read_csv(here::here("Witchtrial_diffusion","Cheney_Catholic-Hierarchy_clean.csv"), show_col_types = FALSE) %>% filter(`1000to1700`==1)

# export the city names, to manually match them up with the BHPR cities
# city_cross_cap <-
#   read_csv(here::here("Witchtrial_diffusion","BHPR_CentralityUpdated_043022_kdsupdates.csv"), show_col_types = FALSE) %>%
#   dplyr::select(currcountry, city, bhpr_runningnr) %>% distinct() %>%
#   mutate(city = str_replace_all(city, "_", " ")) %>% #replace all _'s
#   mutate(city = str_to_title(city)) %>%
#   # Correct capitalization of prepositions
#   mutate(city = str_replace_all(city, " Am ", " am ")) %>%
#   mutate(city = str_replace_all(city, " An ", " an ")) %>%
#   mutate(city = str_replace_all(city, " Im ", " im ")) %>%
#   mutate(city = str_replace_all(city, " Der ", " der ")) %>%
#   mutate(city = str_replace_all(city, " Unter ", " unter ")) %>%
#   mutate(city = str_replace_all(city, " Ob ", " ob ")) %>%
#   # merge in each supplemental file
#   merge(distinct(dollinger[,c("Region","City","id")]), by.x="city", by.y="City",
#         suffixes = c(".bhpr",".doll"), all=TRUE) %>%
#   merge(distinct(pagel[,c("Region","City", "id")]), by.x="city", by.y="City",
#         suffixes = c(".doll",".pagel"), all=TRUE) %>%
#   merge(distinct(jacob[,c("Panel","City", "id")]), by.x="city", by.y="City",
#         all=TRUE) %>%
#   merge(distinct(cheney[,c("Country","PlaceName", "ID")]),
#         by.x="city", by.y="PlaceName", all=TRUE)
# # to do some manual matching...
# write_csv(city_cross_cap, here::here("Witchtrial_diffusion", "out_data", "city_cross_cap.csv"), na="")
# (corrections have been made since this was first created)

# harmonize names! to match the trials-BHPR ones
pagel$City[pagel$City=="Arnheim)"] <- "Arnhem"
jacob$City[jacob$City=="Baden)"] <- "Baden-Baden"
pagel$City[pagel$City=="Berlin-Koelln)"] <- "Berlin"
jacob$City[jacob$City=="Biberach (Riss)"] <- "Biberach"
cheney$PlaceName[cheney$PlaceName=="Bourg-en-Bresse"] <- "Bourg En Bresse"
pagel$City[pagel$City=="Koeln)"] <- "Cologne" 
cheney$PlaceName[cheney$PlaceName=="Koeln)"] <- "Cologne"
dollinger$City[dollinger$City=="Kammin"] <- "Cammin"
pagel$City[pagel$City=="Kammin"] <- "Cammin"
jacob$City[jacob$City=="Frankenberg (Eder)"] <- "Frankenberg"
jacob$City[jacob$City=="Frankfurt (Main)"] <- "Frankfurt am Main"
dollinger$City[dollinger$City=="Frankfurt-on-the-Oder"] <- "Frankfurt an der Oder" 
pagel$City[pagel$City=="Frankfurt a. O."] <- "Frankfurt an der Oder" 
jacob$City[jacob$City=="Frankfurt (Oder)"] <- "Frankfurt an der Oder"
jacob$City[jacob$City=="Fuerstenwalde"] <- "Furstenwalde"
jacob$City[jacob$City=="Guestrow"] <- "Gustrow"
jacob$City[jacob$City=="Halle (Saale)"] <- "Halle"
pagel$City[pagel$City=="Hannover"] <- "Hanover"
jacob$City[jacob$City=="Bad Hersfeld"] <- "Hersfeld"
jacob$City[jacob$City=="Hof (Seele)"] <- "Hof"
jacob$City[jacob$City=="Kempten (Allgaeu)"] <- "Kempten"
jacob$City[jacob$City=="Kirchheim u. Teck"] <- "Kirchheim unter Teck"
dollinger$City[dollinger$City=="Kolberg"] <- "Kolobrzeg" 
pagel$City[pagel$City=="Kolberg"] <- "Kolobrzeg"
dollinger$City[dollinger$City=="Koenigsberg"] <- "Konigsberg"
pagel$City[pagel$City=="Koenigsberg"] <- "Konigsberg"
dollinger$City[dollinger$City=="Koeslin"] <- "Koslin" 
pagel$City[pagel$City=="Koeslin"] <- "Koslin"
jacob$City[jacob$City=="Koethen"] <- "Kothen"
jacob$City[jacob$City=="Bad Kreuznach"] <- "Kreuznach"
jacob$City[jacob$City=="Landau (Pfalz)"] <- "Landau"
jacob$City[jacob$City=="Bad Langensalza"] <- "Langensalza"
jacob$City[jacob$City=="Lindau (Bodensee)"] <- "Lindau"
jacob$City[jacob$City=="Marburg"] <- "Marburg Hessen"
jacob$City[jacob$City=="Muelheim (Rurhr)"] <- "Muelheim"
dollinger$City[dollinger$City=="Muehlhausen (Thuringia)"] <- "Mulhausen Thr"
pagel$City[pagel$City=="Muehlhausen"] <- "Mulhausen Thr" 
jacob$City[jacob$City=="Muehlhausen"] <- "Mulhausen Thr"
jacob$City[jacob$City=="Naumburg (Saale)"] <- "Naumburg"
pagel$City[pagel$City=="Nimwegen"] <- "Nijmegen"
jacob$City[jacob$City=="Nuernberg"] <- "Nuremberg"
jacob$City[jacob$City=="Osterode am Harz"] <- "Osterode"
cheney$PlaceName[cheney$PlaceName=="s-Hertogenbosch"] <- "S Hertogenbosch"
jacob$City[jacob$City=="Saarbruecken"] <- "Saarbrucken"
cheney$PlaceName[cheney$PlaceName=="Saint-Omer"] <- "Saint Omer"
jacob$City[jacob$City=="Schwabisch Gmuend"] <- "Schwabisch Gmund"
dollinger$City[dollinger$City=="Dorpat"] <- "Tartu" 
pagel$City[pagel$City=="Dorpat"] <- "Tartu"
jacob$City[jacob$City=="Villingen-Schwenningen"] <- "Villingen"
jacob$City[jacob$City=="Wolfenbuettel"] <- "Wolfenbuttel"
cheney$PlaceName[cheney$PlaceName=="Wurzburg"] <- "Wuerzburg"
pagel$City[pagel$City=="Zutfen"] <- "Zutphen"

# some of these include temporally-specific variables and need to be turned into panels: 
# Pagel, on when a city was part of the Hanse or not
# Jacob, on when a city gained and lost independence
# Cheney, on when a city was an archbishopric/bishopric
# each needs to match the time-series dimensions of the main panel

# Pagel, on when a city was part of the Hanse or not: Entry, Exit
pagel_panel <- pagel %>%
  mutate(Exit = replace_na(Exit, 1669)) %>%
  mutate(year = Map(seq, Entry, Exit)) %>% # inclusive!
  unnest(year) %>%
  mutate(hanse_pagel = 1) %>%
  dplyr::select(id, Region, City, year, hanse_pagel) %>%
  # fill in non-included years to match the main panel dimensions
  complete(nesting(id), year = seq(1300, 1700, 1L)) %>%
  group_by(id) %>%
  fill(id, Region, City, .direction="downup") %>%
  ungroup() %>%
  mutate(hanse_pagel = replace_na(hanse_pagel, 0)) %>%
  filter(year>1300&year<1700) %>% #trim the temporal ends again
  rename(id_pagel=id, region_pagel=Region, city=City)

# Jacob, on when a city gained and lost independence: "Independence begin", "Independence end"
jacob_panel <- jacob %>%
  dplyr::select(id, Panel, City, freeimp_jacob = `Free/Imperial`, `Independence begin`, `Independence end`) %>%
  # inclusive expansion; if 0 for NAs, which will be filtered out later
  mutate(year = Map(seq, `Independence begin`, `Independence end`)) %>% 
  unnest(year) %>%
  dplyr::select(id, Panel, City, year, freeimp_jacob) %>%
  mutate(freeimp_jacob = ifelse(year!=0, 1, 0)) %>% # add to sequenced years
  # fill in non-included years to match the main panel dimensions
  complete(nesting(id), year = seq(1300, 1700, 1L)) %>%
  group_by(id) %>%
  fill(id, Panel, City, .direction="downup") %>%
  ungroup() %>%
  mutate(freeimp_jacob = replace_na(freeimp_jacob, 0)) %>%
  filter(year>1300&year<1700) %>% #trim the temporal ends again
  rename(id_jacob=id, region_jacob=Panel, city=City)

# Cheney, on when a city was an archbishopric/bishopric: StartDate, EndDate, Diocese, Archdiocese
cheney_panel <- cheney %>%
  mutate(year = Map(seq, StartYear, EndYear)) %>% # inclusive!
  unnest(year) %>%
  dplyr::select(ID, Country, PlaceName, year, dio_cheney = Diocese, archdio_cheney = Archdiocese) %>%
  # fill in non-included years to match the main panel dimensions
  complete(nesting(ID), year = seq(1300, 1700, 1L)) %>%
  group_by(ID) %>%
  fill(ID, Country, PlaceName, dio_cheney, archdio_cheney, .direction="downup") %>%
  ungroup() %>%
  mutate(dio_cheney = replace_na(dio_cheney, 0),
         archdio_cheney = replace_na(archdio_cheney, 0)) %>%
  filter(year>1300&year<1700) %>% #trim the temporal ends again
  rename(id_cheney=ID, country_cheney=Country, city=PlaceName)

# Dollinger just needs a col that says he names that city as Hanse
dollinger <- dollinger %>%
  mutate(hanse_dollinger = 1) %>%
  rename(id_dollinger=id, region_dollinger=Region, city=City, 
         hanse_doll_rank=Indication)

panel_dataset_net <- panel_dataset_net %>%
  merge(dollinger, by.x="", by.y="city",
        suffixes = c(".bhpr",".doll"), all.x=T, all.y=F) %>%
  merge(pagel_panel, by.x=c("year","city"), by.y=c("year","city"),
        suffixes = c(".doll",".pagel"), all.x=T, all.y=F) %>%
  merge(jacob_panel, by.x=c("year","city"), by.y=c("year","city"),
        all.x=T, all.y=F) %>%
  merge(cheney_panel,by.x=c("year","city"), by.y=c("year","city"), 
        all.x=T, all.y=F) %>%
# fill 0's for NAs when a city wasn't in the supplements:
  mutate(across(c(hanse_dollinger, hanse_pagel, freeimp_jacob, dio_cheney, archdio_cheney), ~replace_na(.,0)))

###---###---###---###---###---###---###---###---###---###---
# 5. better climate data ####
###---###---###---###---###---###---###---###---###---###---

# source(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "code", "5-create-temperature.R"))
# temperature <- read.csv(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019", "data", "clean", "temperature.csv"), header=TRUE, stringsAsFactors=FALSE)

# Instead of Oster/Leeson & Russ temperature/weather data, use CCSM4.0 climate data and our own script!
# We can't use climate deviation reprojections from Guiot, Corona, & ESCARSEL Members (2010) - often used by economists - because it doesn't cover the Alps - missing for lots of our cases. It's also not great for our purposes, since they created values at gridded locations rather than a raster (surface of values). An alternative is the Community Climate Systems Model, 850-1850 (CCSM 4.0), which is better data (it's the standard for climate scientists) because it has monthly values in a surface for the whole world. In our script, rely on materials from N. Gauthier and rasterbricks he prepared.

# source(here::here("Witchtrial_diffusion","scripts","Climate_prep_2022-05-15.R"))
climate_net <- read_csv(here::here("Witchtrial_diffusion", "out_data", "CCSM_climate_net_annual.csv"))

# lag climate measures - use 1 year
climate_net <- climate_net %>%
  group_by(newID) %>%
  arrange(year) %>% # order within city by year, so lags will be correct
  mutate(prc_lag1 = lag(prc), 
         temp_lag1 = lag(temp)) %>% # lag 1 year
  # look back at previous
  # come back and adjust the previous period length (15 right now) based on archy lit?
  # first calculate the relative mean
  mutate(prc_mean_lag = rollapplyr(prc_lag1, width=list(-1:-60), mean,
                                   na.rm=TRUE, fill=NA, partial=TRUE, align="right"), 
         temp_mean_lag = rollapplyr(temp_lag1, 60, mean, 
                                    na.rm=TRUE, fill=NA, partial=TRUE, align="right")) %>% 
  # housekeeping! produced some NaN's taking means where all NAs
  # mutate(prc_mean_lag = na_if(prc_mean_lag, NaN),
  #        temp_mean_lag = na_if(temp_mean_lag, NaN)) %>%
  # now relative standard deviation around that mean 
  mutate(prc_sd_lag = rollapplyr(prc_lag1, width=list(-1:-60), sd,
                                 na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         temp_sd_lag = rollapplyr(temp_lag1, 60, sd,
                                  na.rm=TRUE, fill=NA, partial=TRUE, align="right")) %>%
  # now values relative to the mean, and how they compare vis a vis sd
  # these are all based on temp reprojections for prior 15 years, might not be the most appropriate
  # If "shock" is evaluated as postive OR negative, then 
  # using a 10-year reference group, half of years are "shocks"! About 1/3 are above and about 1/3 are below, for prc and temp
  # using a 15-year reference group,
  mutate(prc_adj_lag = prc_lag1 - prc_mean_lag,
         temp_adj_lag = temp_lag1 - temp_mean_lag,
         prc_shock_hi = if_else(prc_lag1>(prc_mean_lag+(2*prc_sd_lag)), 1,0),
         prc_shock_lo = if_else(prc_lag1<(prc_mean_lag-(2*prc_sd_lag)), 1,0),
         prc_shock = ifelse(prc_shock_hi==1|prc_shock_lo==1, 1,0),
         temp_shock_hi = if_else(temp_lag1>temp_mean_lag+2*temp_sd_lag, 1,0),
         temp_shock_lo = if_else(temp_lag1<temp_mean_lag-2*temp_sd_lag, 1,0),
         temp_shock = ifelse(temp_shock_hi==1|temp_shock_lo==1, 1,0)) %>%
  ungroup()

### merge in the climate data! 
  
panel_dataset_net <- panel_dataset_net %>%
  merge(climate_net, by.x=c("newID", "year"), by.y=c("newID", "year"), all.x=TRUE, all.y=FALSE)

saveRDS(panel_dataset_net, here::here("Witchtrial_diffusion", "out_data", "panel_net_updated.rds"))
# panel_dataset_net <- read_rds(here::here("Witchtrial_diffusion", "out_data", "panel_net_updated.rds"))

###---###---###---###---###---###---###---###---###---###---
# 6. add USTC demonology printing data to the R environment ####
###---###---###---###---###---###---###---###---###---###---

# location info was recorded using EPSG:3035 (LAEA) from epsg.io: if included in trials data, then coords are LAEA transformations of trials coords; if not, then recorded based on search by name with manual adjustments based on the visual map

# only Hexenhammer/Malleus maleficarum ---###---
malleus <- read.csv(here::here("Witchtrial_diffusion", "Malleus_USTC_Data-entry.csv"), header=TRUE, encoding = "latin1", stringsAsFactors=FALSE)

# Later steps will use printing location names for looking up distances. These do not need to be consistent with city names in the trials data, but it is good practice to standardize anyway.
# setdiff(unique(malleus$Location), unique(trials$city))
# setdiff(unique(malleus$StdName), unique(trials$city)) # more matches here
# trials %>% dplyr::select(city) %>% filter(str_detect(city, "Frank"))
malleus$StdName[malleus$Location=="Frankfurt am Main"] <- "Frankfurt am Main"
# trials %>% dplyr::select(city) %>% filter(str_detect(city, "Stras"))
malleus$StdName[malleus$Location=="Strasbourg"] <- "Strassburg"
# Speyer is the only city in MM but not trials data

# All major demonology texts ---###---
demonology <- read.csv(here::here("Witchtrial_diffusion", "Demonology_USTC_Data-entry.csv"), header=TRUE, encoding = "latin1", stringsAsFactors=FALSE)

# Later steps will use printing location names for looking up distances. These do not need to be consistent with city names in the trials data, but it is good practice to standardize anyway.
# setdiff(unique(demonology$Location), unique(panel_dataset_net$city))
# setdiff(unique(demonology$StdName), unique(panel_dataset_net$city)) # more matches here
# trials %>% dplyr::select(city) %>% filter(str_detect(city, "Frank"))
demonology$StdName[demonology$Location=="Frankfurt am Main"] <- "Frankfurt am Main"
# trials %>% dplyr::select(city) %>% filter(str_detect(city, "Stras"))
demonology$StdName[demonology$Location=="Strasbourg"] <- "Strassburg"

# a summary table by printing city
demonology %>%
  group_by(StdName) %>%
  summarize(editions = n(),
            copies = sum(PresentCopies))

# create a summary table by text
texts.table <- demonology %>%
  group_by(ShortTitle) %>%
  summarize(editions = n(),
            copies = sum(PresentCopies),
            first = min(Year),
            last = max(Year),
            cities = length(unique(StdName))) %>%
  rbind(c("Total", nrow(demonology), sum(demonology$PresentCopies), min(demonology$Year), max(demonology$Year), length(unique(demonology$StdName))))

write.csv(texts.table, here::here("Witchtrial_diffusion", "out_tables", "texts_table.csv"), row.names=TRUE)

###---###---###---###---###---###---###---###---###---###---
### 7. Temporal diffusion ------------
###---###---###---###---###---###---###---###---###---###---

# L&R use 1500-1700 and 1520-1770 for their analyses. Malleus malificarum was first printed in 1487, and USTC originally went through 1650 . Formicarius was first printed in 1473. With USTC extension to 1700, do merging and calculations for trials up to 1700. Analyses use subsets, up to 1650, because this was our original approach.

battles_matrix <- read_rds(here::here("Witchtrial_diffusion", "out_data", "battles_matrix.rds")) # created in script file 8

panel_dataset_net <- panel_dataset_net %>%
  left_join(battles_matrix[battles_matrix$year<1700,], by = c("year"="year")) %>% # keep all rows of x (main df), add y (sums)
  arrange(year, bhpr_runningnr)

# 7.1. calculate temporal diff of witch-hunting ideas ----
# Make a matrix by year of annual and cumulative totals of Malleus maleficarum editions
# NA filled where there was no prev value to lag
# and when the rolling sum included an NA input

malleus_matrix <- malleus %>%
  group_by(Year) %>% 
  summarize(MM_ann = n()) %>% # total editions in a year
  # before lag, make sure all years included
  complete(Year=1300:1700) %>% # go to 1700 to match updated USTC
  arrange(Year) %>%
  replace_na(list(MM_ann = 0)) %>%
  mutate(MM_lag1=lag(MM_ann, n=1, default = NA)) %>% # lag by 1 year
  mutate(MM_sum5 = rollapplyr(MM_ann, width=list(-1:-5), sum, # 5 year moving sum
                                na.rm=TRUE, fill=NA, align="right")) %>%
  mutate(MM_sum10 = rollapplyr(MM_ann, width=list(-1:-10), sum, # 10 year moving sum
                                 na.rm=TRUE, fill=NA, align="right")) %>%
  mutate(MM_sum20 = rollapplyr(MM_ann, width=list(-1:-20), sum, # 20 year moving sum
                                 na.rm=TRUE, fill=NA, align="right"))

# Add list columns of the place names of MM editions and the copy counts in time t and t-1:
malleus_matrix <- malleus %>%
  dplyr::select(Year, StdName, PresentCopies) %>%
  # names will be used as column names in the distance matrix, so replace any spaces (e.g., in Frankfurt am Main) with "."
  mutate(StdName = str_replace_all(StdName, " ",".")) %>%
  group_by(Year) %>%
  nest(., MMPrinters_0 = c(StdName), MMCopies_0 = c(PresentCopies)) %>%
  full_join(malleus_matrix, by="Year") %>% # join lists and sums together
  ungroup() %>%
  arrange(Year) %>%
  mutate(MMPrinters_1=lag(MMPrinters_0, n=1, default = list(NULL)),
         MMCopies_1=lag(MMCopies_0, n=1, default = list(NULL))) # lag by 1 year

# flatten from tibble to list
MMPrinters_1 <- vector("list", dim(malleus_matrix)[1])
for (i in 1:dim(malleus_matrix)[1]){
  flat.ids <- malleus_matrix$MMPrinters_1[i][[1]]
  flat.ids <- flat.ids[!sapply(flat.ids,is.null)]
  if(length(flat.ids)!=0){
    MMPrinters_1[[i]] <- Reduce(rbind, flat.ids)
  } else {MMPrinters_1[[i]] <- list(NULL)}
}
malleus_matrix$MMPrinters_1 <- MMPrinters_1

MMCopies_1 <- vector("list", dim(malleus_matrix)[1])
for (i in 1:dim(malleus_matrix)[1]){
  flat.ids <- malleus_matrix$MMCopies_1[i][[1]]
  flat.ids <- flat.ids[!sapply(flat.ids,is.null)]
  if(length(flat.ids)!=0){
    MMCopies_1[[i]] <- Reduce(rbind, flat.ids)
  } else {MMCopies_1[[i]] <- list(NULL)}
}
malleus_matrix$MMCopies_1 <- MMCopies_1

# merge MM edition calculations into the main data (1300-1700 CE)

panel_dataset_net <- panel_dataset_net %>%
  left_join(malleus_matrix, by = c("year"="Year")) %>% # keep all rows of x (main df), add y (sums)
  arrange(year, bhpr_runningnr)

# Same thing, but now for all demonological texts in USTC ---###---

texts_matrix <- demonology %>%
  group_by(Year) %>% 
  summarize(texts_ann = n()) %>% # total editions in a year
  # before lag, make sure all years included
  complete(Year=1300:1700) %>% # max USTC
  arrange(Year) %>%
  replace_na(list(texts_ann = 0)) %>%
  mutate(texts_lag1=lag(texts_ann, n=1, default = NA)) %>% # lag by 1 year
  mutate(texts_sum5 = rollapplyr(texts_ann, width=list(-1:-5), sum, # 5 year moving sum
                              na.rm=TRUE, fill=NA, align="right")) %>%
  mutate(texts_sum10 = rollapplyr(texts_ann, width=list(-1:-10), sum, # 10 year moving sum
                               na.rm=TRUE, fill=NA, align="right")) %>%
  mutate(texts_sum20 = rollapplyr(texts_ann, width=list(-1:-20), sum, # 20 year moving sum
                               na.rm=TRUE, fill=NA, align="right"))

texts_matrix <- demonology %>%
  filter(!ShortTitle %in% c("Malleus maleficarum")) %>%
  group_by(Year) %>% 
  summarize(texts_not_ann = n()) %>% # total non-MM editions in a year
  complete(Year=1300:1700) %>% # max USTC
  arrange(Year) %>%
  replace_na(list(texts_not_ann = 0)) %>%
  mutate(texts_not_lag1=lag(texts_not_ann, n=1, default = NA)) %>% 
  mutate(texts_not_sum5 = rollapplyr(texts_not_ann, width=list(-1:-5), sum, 
                                 na.rm=TRUE, fill=NA, align="right")) %>%
  mutate(texts_not_sum10 = rollapplyr(texts_not_ann, width=list(-1:-10), sum, #
                                  na.rm=TRUE, fill=NA, align="right")) %>%
  mutate(texts_not_sum20 = rollapplyr(texts_not_ann, width=list(-1:-20), sum, 
                                  na.rm=TRUE, fill=NA, align="right")) %>%
  full_join(texts_matrix, by="Year") # join all and non-MM together

# Add list columns of the place names of editions and the copy counts in time t and t-1:
texts_matrix <- demonology %>%
  dplyr::select(Year, StdName, PresentCopies) %>%
  # names will be used as column names in the distance matrix, so replace any spaces (e.g., in Frankfurt am Main) with "."
  mutate(StdName = str_replace_all(StdName, " ",".")) %>%
  group_by(Year) %>%
  nest(., AllPrinters_0 = c(StdName), AllCopies_0 = c(PresentCopies)) %>%
  full_join(texts_matrix, by="Year") %>% # join lists and sums together
  ungroup() %>%
  arrange(Year) %>%
  mutate(AllPrinters_1=lag(AllPrinters_0, n=1, default = list(NULL)),
         AllCopies_1=lag(AllCopies_0, n=1, default = list(NULL))) # lag by 1 year
texts_matrix <- demonology %>%
  filter(!ShortTitle %in% c("Malleus maleficarum")) %>%
  dplyr::select(Year, StdName, PresentCopies) %>%
  mutate(StdName = str_replace_all(StdName, " ",".")) %>%
  group_by(Year) %>%
  nest(., AllPrinters_not_0 = c(StdName), AllCopies_not_0 = c(PresentCopies)) %>%
  full_join(texts_matrix, by="Year") %>%
  ungroup() %>%
  arrange(Year) %>%
  mutate(AllPrinters_not_1=lag(AllPrinters_not_0, n=1, default = list(NULL)),
         AllCopies_not_1=lag(AllCopies_not_0, n=1, default = list(NULL)))

# flatten from tibble to list
AllPrinters_1 <- vector("list", dim(texts_matrix)[1])
for (i in 1:dim(texts_matrix)[1]){
  flat.ids <- texts_matrix$AllPrinters_1[i][[1]]
  flat.ids <- flat.ids[!sapply(flat.ids,is.null)]
  if(length(flat.ids)!=0){
    AllPrinters_1[[i]] <- Reduce(rbind, flat.ids)
  } else {AllPrinters_1[[i]] <- list(NULL)}
} # ignore warnings re: "number of items to replace is not a multiple of replacement length" - runs fine with expected results
texts_matrix$AllPrinters_1 <- AllPrinters_1
AllCopies_1 <- vector("list", dim(texts_matrix)[1])
for (i in 1:dim(texts_matrix)[1]){
  flat.ids <- texts_matrix$AllCopies_1[i][[1]]
  flat.ids <- flat.ids[!sapply(flat.ids,is.null)]
  if(length(flat.ids)!=0){
    AllCopies_1[[i]] <- Reduce(rbind, flat.ids)
  } else {AllCopies_1[[i]] <- list(NULL)}
}
texts_matrix$AllCopies_1 <- AllCopies_1

# non-MM
AllPrinters_not_1 <- vector("list", dim(texts_matrix)[1])
for (i in 1:dim(texts_matrix)[1]){
  flat.ids <- texts_matrix$AllPrinters_not_1[i][[1]]
  flat.ids <- flat.ids[!sapply(flat.ids,is.null)]
  if(length(flat.ids)!=0){
    AllPrinters_not_1[[i]] <- Reduce(rbind, flat.ids)
  } else {AllPrinters_not_1[[i]] <- list(NULL)}
}
texts_matrix$AllPrinters_not_1 <- AllPrinters_not_1
AllCopies_not_1 <- vector("list", dim(texts_matrix)[1])
for (i in 1:dim(texts_matrix)[1]){
  flat.ids <- texts_matrix$AllCopies_not_1[i][[1]]
  flat.ids <- flat.ids[!sapply(flat.ids,is.null)]
  if(length(flat.ids)!=0){
    AllCopies_not_1[[i]] <- Reduce(rbind, flat.ids)
  } else {AllCopies_not_1[[i]] <- list(NULL)}
}
texts_matrix$AllCopies_not_1 <- AllCopies_not_1

# merge edition calculations into the main data; continue to preserve all years to keep all trials obs

panel_dataset_net <- panel_dataset_net %>%
  left_join(texts_matrix, by = c("year"="Year")) %>% # keep all rows of x (main df), add y (sums)
  arrange(bhpr_runningnr, year)

# 7.2. calculate temporal diff of witch-hunting activities ----
# Make a matrix by year of annual and cumulative totals of trials
# NA filled where there was no prev value to lag
# and when the rolling sum included an NA input

trials_net_matrix <- panel_dataset_net %>%
  group_by(year) %>% 
  summarize(nobs = sum(ifelse(tried>0,1,0)),
            tried_ann = sum(tried),
            deaths_ann = sum(deaths)) %>% # some NA obs
  # before lag, make sure to include all years
  complete(year=1300:1700) %>%
  replace_na(list(nobs = 0, tried_ann = 0, deaths_ann = 0)) %>%
  arrange(year) %>%
  mutate(tried_lag1=lag(tried_ann, n=1, default = NA)) %>% # lag by 1 year
  mutate(tried_sum5 = rollapplyr(tried_ann, width=list(-1:-5), sum, # 5 year moving sum
                                 na.rm=TRUE, fill=NA, align="right")) %>%
  mutate(tried_sum10 = rollapplyr(tried_ann, width=list(-1:-10), sum, # 10 year moving sum
                                  na.rm=TRUE, fill=NA, align="right")) %>%
  mutate(tried_sum20 = rollapplyr(tried_ann, width=list(-1:-20), sum, # 20 year moving sum
                                  na.rm=TRUE, fill=NA, align="right")) %>%
  mutate(tried_sum30 = rollapplyr(tried_ann, width=list(-1:-30), sum, # 30 year moving sum
                                  na.rm=TRUE, fill=NA, align="right"))

trials_net_matrix <- panel_dataset_net %>%
  dplyr::select(year, newID, tried) %>%
  filter(tried>0) %>% # since this panel is not only 1's (balanced) 
  group_by(year) %>%
  nest(., Prosecutors_0 = c(newID), Trials_0 = c(tried)) %>%
  full_join(trials_net_matrix, by="year") %>% # join lists and sums together
  ungroup() %>%
  arrange(year) %>%
  # before lag, make sure to include all years
  complete(year=1300:1700) %>%
  mutate(Prosecutors_1=lag(Prosecutors_0, n=1, default = list(NULL)),
         Trials_1=lag(Trials_0, n=1, default = list(NULL))) # lag by 1 year

# flatten list-column contents from lists of tibbles to lists
Prosecutors_1 <- vector("list", dim(trials_net_matrix)[1])
for (i in 1:dim(trials_net_matrix)[1]){
  flat.ids <- trials_net_matrix$Prosecutors_1[i][[1]]
  flat.ids <- flat.ids[!sapply(flat.ids,is.null)]
  if(length(flat.ids)!=0){
    Prosecutors_1[[i]] <- Reduce(rbind, flat.ids)
  } else {Prosecutors_1[[i]] <- list(NULL)}
}
trials_net_matrix$Prosecutors_1 <- Prosecutors_1

Trials_1 <- vector("list", dim(trials_net_matrix)[1])
for (i in 1:dim(trials_net_matrix)[1]){
  flat.ids <- trials_net_matrix$Trials_1[i][[1]]
  flat.ids <- flat.ids[!sapply(flat.ids,is.null)]
  if(length(flat.ids)!=0){
    Trials_1[[i]] <- Reduce(rbind, flat.ids)
  } else {Trials_1[[i]] <- list(NULL)}
}
trials_net_matrix$Trials_1 <- Trials_1

# merge into the panel dataset
panel_dataset_net <- panel_dataset_net %>%
  left_join(trials_net_matrix, by = c("year")) %>% # keep all rows of x (main df), add y (sums)
  arrange(bhpr_runningnr, year)

###---###---###---###---###---###---###---###---###---###---
# 8. Distance calculations ####
###---###---###---###---###---###---###---###---###---###---

# First, use trials reprojected from WGS84 to LAEA in LR script 8, which will give more accurate distances and which also has the advantage of being meters-based (the calculated units will make much more sense)

trials_net_coords <- trials_net_laea %>% 
  dplyr::select(newID, lon, lat) %>%
  distinct(newID, lon, lat) %>%
  arrange(newID) %>%
  column_to_rownames(var="newID")

battles_coords <- battles_laea %>% # don't need to be distinct here
  dplyr::select(newID, lon, lat) %>%
  column_to_rownames(var="newID")

demonology_coords <- demonology %>% 
  dplyr::select(StdName, XCOORD, YCOORD) %>%
  mutate(StdName = str_replace_all(StdName, " ",".")) %>%
  distinct(StdName, XCOORD, YCOORD) %>%
  mutate(lon = XCOORD, lat = YCOORD) %>%
  dplyr::select(-XCOORD, -YCOORD) %>%
  column_to_rownames(var="StdName")

# The coord ID systems use different naming conventions (trials:"ID_xx" network cities: "BHPR_xx" battles:"Btl_xx" printers:"StdName"), so they can all go in the same distance matrix that we can reference in multiple calculations
all_coords <- rbind(trials_net_coords, battles_coords, demonology_coords)

# Create a distance matrix with the diagonal and upper half filled in - will need all values in order to do any functions with the matrix
dist_euc_matrix <- all_coords %>%
  dist(., method = "euclidean", diag = TRUE, upper = TRUE)/1000 # distances in meters, so divide for km
dist_euc_matrix <- as.data.frame(as.matrix(dist_euc_matrix))

# To correct/allow for slight differences in coords, make all distances of <0.1km (.06mi) to 0.1km. Values of 0 have to be replaced anyway, as the calculations don't work with 0 in the denominator.


# 8.1. calculate spatial diff of witch-hunting ideas ----
# with previous period = 1 year, scaled by sqrts of distances to x=1:
# produces higher values when more and close editions,
# lower values when fewer or farther copies

# MM only #
panel_dataset_net <- panel_dataset_net %>%
  mutate(diffMM_1 = rep(NA, dim(.)[1]))

for (i in 1: dim(panel_dataset_net)[1]){
  # if prior MM editions for t=1
  if(!is.null(panel_dataset_net$MMCopies_1[[i]][[1]])){ 
    dist.subs <- dist_euc_matrix[panel_dataset_net$newID[i], 
                                 panel_dataset_net$MMPrinters_1[[i]]]
    # see note at start of this section on spatial diffusion calculations
    dist.subs[dist.subs<=0.1] <- 0.1 
    panel_dataset_net$diffMM_1[i] <- sum(rep(1,panel_dataset_net$MM_lag1[i])/sqrt(dist.subs), 
                                          na.rm=TRUE)
  } else {panel_dataset_net$diffMM_1[i] <- 0} # if no prior for t=1, fill 0
}

# any text #
panel_dataset_net <- panel_dataset_net %>%
  mutate(difftexts_1 = rep(NA, dim(.)[1]))

for (i in 1: dim(panel_dataset_net)[1]){
  # if any prior editions for t=1
  if(!is.null(panel_dataset_net$AllCopies_1[[i]][[1]])){ 
    dist.subs <- dist_euc_matrix[panel_dataset_net$newID[i], 
                                 panel_dataset_net$AllPrinters_1[[i]]]
    dist.subs[dist.subs<=0.1] <- 0.1 # see note at start of section
    panel_dataset_net$difftexts_1[i] <- sum(rep(1,panel_dataset_net$texts_lag1[i])/sqrt(dist.subs), 
                                             na.rm=TRUE)
  } else {panel_dataset_net$difftexts_1[i] <- 0} # if no prior for t=1, fill 0
}

# non-MM text #

panel_dataset_net <- panel_dataset_net %>%
  mutate(difftexts_not_1 = rep(NA, dim(.)[1]))

for (i in 1: dim(panel_dataset_net)[1]){
  # if any prior editions for t=1
  if(!is.null(panel_dataset_net$AllCopies_not_1[[i]][[1]])){ 
    dist.subs <- dist_euc_matrix[panel_dataset_net$newID[i], 
                                 panel_dataset_net$AllPrinters_not_1[[i]]]
    dist.subs[dist.subs<=0.1] <- 0.1 # see note at start of section
    panel_dataset_net$difftexts_not_1[i] <- sum(rep(1, panel_dataset_net$texts_not_lag1[i])/sqrt(dist.subs), 
                                            na.rm=TRUE)
  } else {panel_dataset_net$difftexts_not_1[i] <- 0} # if no prior for t=1, fill 0
}

# 8.2. calculate spatial diff of witch-hunting activities ----
# with previous period = 1 year, scaled by sqrts of distances to x=1:
# produces higher values when more and close trials,
# lower values when fewer or farther trials

panel_dataset_net <- panel_dataset_net %>%
  mutate(difftried_1 = rep(NA, dim(.)[1]))

for (i in 1: dim(panel_dataset_net)[1]){
  if(!is.null(panel_dataset_net$Prosecutors_1[[i]][[1]])){ # if prior trials for t=1
    dist.subs <- dist_euc_matrix[panel_dataset_net$newID[i], panel_dataset_net$Prosecutors_1[[i]]]
    dist.subs[dist.subs<=0.1] <- 0.1 # see note at start of section
    panel_dataset_net$difftried_1[i] <- sum(panel_dataset_net$Trials_1[i][[1]]/sqrt(dist.subs), 
                                             na.rm=TRUE)
  } else {panel_dataset_net$difftried_1[i] <- 0} # if no prior for t=1, fill 0
}


# 8.3. calculate spatial influence of confessional battles ----
# with previous period = 1 years;
# in a later step this will be aggregated into prior 10 years

panel_dataset_net <- panel_dataset_net %>%
  mutate(diffbattle_1 = rep(NA, dim(.)[1]))

for (i in 1: dim(panel_dataset_net)[1]){
  if(!is.null(panel_dataset_net$Battles_1[[i]][[1]])){ # if prior battles for t=1
    dist.subs <- dist_euc_matrix[panel_dataset_net$newID[i], panel_dataset_net$Battles_1[[i]]]
    
    dist.subs[dist.subs<=0.1] <- 0.1 # see note at start of section
    panel_dataset_net$diffbattle_1[i] <- sum(panel_dataset_net$battles_lag1[i]/sqrt(dist.subs), 
                                              na.rm=TRUE)
  } else {panel_dataset_net$diffbattle_1[i] <- 0} # if no prior for t=1, fill 0
}

# 8.4. calculate rolling diffusion values ----
# Only one row per year, so it's straightforward
panel_dataset_net <- panel_dataset_net %>%
  arrange(year) %>%
  group_by(bhpr_runningnr) %>% 
  # complete(year=1300:1700) %>% # don't need this anymore
  # create moving sums for different windows
  mutate(diffMM_5 = rollapplyr(diffMM_1, 5, sum, 
                               na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         diffMM_10 = rollapplyr(diffMM_1, 10, sum, 
                                na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         diffMM_20 = rollapplyr(diffMM_1, 20, sum, 
                                na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         difftexts_5 = rollapplyr(difftexts_1, 5, sum, 
                                  na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         difftexts_10 = rollapplyr(difftexts_1, 10, sum, 
                                   na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         difftexts_20 = rollapplyr(difftexts_1, 20, sum, 
                                   na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         difftexts_not_5 = rollapplyr(difftexts_not_1, 5, sum, 
                                  na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         difftexts_not_10 = rollapplyr(difftexts_not_1, 10, sum, 
                                   na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         difftexts_not_20 = rollapplyr(difftexts_not_1, 20, sum, 
                                   na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         difftried_5 = rollapplyr(difftried_1, 5, sum, 
                                  na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         difftried_10 = rollapplyr(difftried_1, 10, sum, 
                                   na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         difftried_20 = rollapplyr(difftried_1, 20, sum, 
                                   na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         diffbattle_5 = rollapplyr(diffbattle_1, 5, sum, 
                                   na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         diffbattle_10 = rollapplyr(diffbattle_1, 10, sum, 
                                    na.rm=TRUE, fill=NA, partial=TRUE, align="right"),
         diffbattle_20 = rollapplyr(diffbattle_1, 20, sum, 
                                    na.rm=TRUE, fill=NA, partial=TRUE, align="right")) %>%
  ungroup() %>%
  arrange(bhpr_runningnr, year)

saveRDS(panel_dataset_net, here::here("Witchtrial_diffusion","out_data", "panel_dataset_net_withdiff.rds"))
# or, grab the already-prepped file
# panel_dataset_net <- read_rds(here::here("Witchtrial_diffusion","out_data", "panel_dataset_net_withdiff.rds"))

###---###---###---###---###---###---###---###---###---###---
# 9. Final adjustments ####
###---###---###---###---###---###---###---###---###---###---

length(unique(panel_dataset_net$newID)) # 585 cities
nrow(panel_dataset_net) #235250 obs

# dummy for if a city was either a diocesan or archdiocesan seat
panel_dataset_net <- panel_dataset_net %>%
  mutate(cath_cheney = if_else(dio_cheney==1|archdio_cheney==1,1,0))

# 9.1 Add a regional dummy ----
# because of spatial clustering in the secondary sources
# make 3x3 grid cells using the EEA reference grid, based on where we have high/dense/thorough coverage from the secondary sources
# We chose the grid cells based on trying to encompass these areas without diluting them, and based on including land area without sea
# amounts to grid cells across northern France, Bel, Neth, Switz, Germ minus E Germ

panel_dataset_net <- panel_dataset_net %>%
  mutate(grid = case_when(
    lon > 3700000 & lon < 4000000 & lat > 2900000 & lat < 3200000 ~ 1,
    lon > 4000000 & lon < 4300000 & lat > 2500000 & lat < 2800000 ~ 2,
    lon > 4000000 & lon < 4300000 & lat > 2800000 & lat < 3100000 ~ 3,
    lon > 4000000 & lon < 4300000 & lat > 3100000 & lat < 3400000 ~ 4,
    lon > 4300000 & lon < 4600000 & lat > 2600000 & lat < 2900000 ~ 5,
    lon > 4300000 & lon < 4600000 & lat > 2900000 & lat < 3200000 ~ 6,
    lon > 4300000 & lon < 4600000 & lat > 3200000 & lat < 3500000 ~ 7,
    # lon > 4600000 & lon < 4900000 & lat > 2600000 & lat < 2900000 ~ 8, # only 2% of obs, so omitting (will be a problem in models)
    TRUE ~ 0
  )) %>%
  mutate(
    grid_0 = ifelse(grid==0, 1, 0),
    grid_1 = ifelse(grid==1, 1, 0),
    grid_2 = ifelse(grid==2, 1, 0),
    grid_3 = ifelse(grid==3, 1, 0),
    grid_4 = ifelse(grid==4, 1, 0),
    grid_5 = ifelse(grid==5, 1, 0),
    grid_6 = ifelse(grid==6, 1, 0),
    grid_7 = ifelse(grid==7, 1, 0))

prop.table(table(panel_dataset_net$grid))
# range from 6.5% of cases (5) to 18.1% (6)

# 9.2 check the comparison between activities and ideas ----

# nrow(trials_clean[trials_clean$year<1550&!is.na(trials_clean$year),]) # 757
# nrow(trials_clean[trials_clean$year>1487&trials_clean$year<1550&!is.na(trials_clean$year),]) # 315 after first MM in first MM wave
# nrow(trials_clean[trials_clean$year<1488&!is.na(trials_clean$year),]) # 442 before MM

# Reverse the printing/trial lags
# to check whether trials spurred printing?
trials_years <- trials_matrix %>%
  full_join(malleus_matrix, by = c("year"="Year")) %>%
  full_join(texts_matrix, by = c("year"="Year"))

trials_years_long <- trials_years %>%
  dplyr::select(year, nobs, tried_ann, texts_ann) %>%
  gather(key="key", value="value", -year)

ggplot(data=trials_years_long, aes(x=year, y=value)) +
  geom_bar(stat="identity") +
  facet_grid(rows = vars(key))

# Judging by this figure, we'll need to test both whether trials drive publication demand and whether publication drives trials! It looks like trials drove publication for the wave of editions in the late 15th century, and like publications accelerated the number of trials in the late 16th century.

# 9.3 start setting up for survival/hazard analyses ----

panel_dataset_net <- panel_dataset_net %>%
  mutate(triedbin = ifelse(tried>0,1,0)) %>%
  arrange(year) %>%
  group_by(bhpr_runningnr) %>%
  mutate(triedtodate = cumsum(tried),
         firsttrialyr = min(year[triedbin==1])) %>%
            # firsttrialyr = Inf if no trials in a city;  
            # will give warnings, but can ignore them
  mutate(firsttrialyr = ifelse(is.infinite(firsttrialyr),NA,firsttrialyr)) %>%
  ungroup() %>%
  mutate(firsttrial = case_when(
    year==firsttrialyr ~ 1, 
    year>firsttrialyr ~ 2,
    TRUE ~ 0))

# 9.4 Choose a reasonable temporal scope ----

# for number of persons tried
plot(trials_clean$year, trials_clean$tried)
trials_clean[trials_clean$year<1700,] %>% group_by(year) %>% summarize(tried_sum = sum(tried)) %>% ggplot(aes(x=year, y=tried_sum)) + geom_bar(stat="identity")
# not much in 14th c, more activity in 1450-1500 with two local peaks, most of the activity from 1550 on

# for number of trials
trials_clean[trials_clean$year<1700,] %>% group_by(year) %>% summarize(trials = n()) %>% ggplot(aes(x=year, y=trials)) + geom_bar(stat="identity")
# for number of persons tried per trial
trials_clean[trials_clean$year<1700,] %>% group_by(year) %>% summarize(tried_per = sum(tried)/n()) %>% ggplot(aes(x=year, y=tried_per)) + geom_bar(stat="identity")

# higher numbers of tried in 16th/17th c reflects more trials; more persons tried per trial in 15th c

# do the trends in the panel follow those in the trials dataset?
panel_dataset_net[panel_dataset_net$year<1700,] %>% group_by(year) %>% summarize(tried_sum = sum(tried)) %>% ggplot(aes(x=year, y=tried_sum)) + geom_bar(stat="identity")

panel_dataset_net[panel_dataset_net$year<1700,] %>% group_by(year) %>% summarize(trials = sum(triedbin)) %>% ggplot(aes(x=year, y=trials)) + geom_bar(stat="identity")

# high point for total trials and number tried in the BHPR cities is the early 17th c; local blip in the late 15th c, but much more in the decades just before and after 1600

# start with 1400, based on data exploration of the trends?
table(panel_dataset_net$year[panel_dataset_net$firsttrial==1])
# if we start in 1400, we lose about 20 cities

surv_dataset_net <- panel_dataset_net %>%
  filter(firsttrial==0|firsttrial==1) %>% # censor at first trial
  mutate(time=year-1400) # create time since first observation

# 9.5 Add temporal dummies ----
# in case the lags are picking up a time trend instead
# Not one year. Can't be 10 years, since that's what we use for our lags. Need to minimize number of times it would overlap with the 10 year lags, so not a multiple of 10. Can't be too long, because that will soak up the entirety of the time lag effect.

surv_dataset_net <- surv_dataset_net %>%
  mutate(period = cut(year, breaks = seq(1400, 1650, 7), right = T, labels = F),
         period2 = cut(year, breaks = seq(1400, 1700, 7), right = T, labels = F)) %>%
  fastDummies::dummy_cols(select_columns = "period")

round(prop.table(table(surv_dataset_net$period)), 3)
# range from .032% of cases to .025% - even coverage - losing ~15% over time to censoring
round(prop.table(table(surv_dataset_net$period, surv_dataset_net$MM_lag1)), 3)
# editions only in 13 of 35 periods
cor(surv_dataset_net$period, surv_dataset_net$MM_lag1, use="pairwise.complete.obs")
cor(surv_dataset_net$period, surv_dataset_net$firsttrial, use="pairwise.complete.obs")
# not correlated

baseline <- panel_dataset_net %>%
  group_by(year) %>%
  summarize(baseline_ann = sum(ifelse(firsttrial==1|firsttrial==2,1,0))/n()) %>%
  mutate(baseline_lag1 = lag(baseline_ann, 1),
         baseline_mean7 = rollapplyr(baseline_lag1, width=list(-1:-7), mean,
                          na.rm=TRUE, fill=NA, partial=TRUE, align="right"))
baseline_p <- panel_dataset_net %>%
  mutate(period = cut(year, breaks = seq(1400, 1650, 7), right = T, labels = F)) %>%
  group_by(period) %>%
  summarize(baseline_pd = sum(ifelse(firsttrial==1|firsttrial==2,1,0))/n()) %>%
  mutate(baseline_pd_lag1 = lag(baseline_pd, 1),
         baseline_pd_mean3 = rollapplyr(baseline_pd_lag1, width=list(-1:-3), mean,
                                     na.rm=TRUE, fill=NA, partial=TRUE, align="right"))

surv_dataset_net <-
  merge(surv_dataset_net, baseline, by="year") %>%
  merge(baseline_p, by="period")

with(surv_dataset_net[surv_dataset_net$year>=1400&surv_dataset_net$year<1651,], cor(baseline_ann, MM_lag1, use="pairwise.complete.obs")) # correlated
with(surv_dataset_net[surv_dataset_net$year>=1400&surv_dataset_net$year<1651,], cor(baseline_pd, MM_lag1, use="pairwise.complete.obs")) # correlated
with(surv_dataset_net[surv_dataset_net$year>=1400&surv_dataset_net$year<1651,], cor(baseline_pd_mean3, MM_lag1, use="pairwise.complete.obs")) # v small correlation
with(surv_dataset_net[surv_dataset_net$year>=1400&surv_dataset_net$year<1651,], cor(baseline_ann, firsttrial, use="pairwise.complete.obs")) # not correlated
with(surv_dataset_net[surv_dataset_net$year>=1400&surv_dataset_net$year<1651,], cor(baseline_pd, firsttrial, use="pairwise.complete.obs")) # not correlated
with(surv_dataset_net[surv_dataset_net$year>=1400&surv_dataset_net$year<1651,], cor(baseline_pd_mean3, firsttrial, use="pairwise.complete.obs")) # not correlated

surv_dataset_net %>% dplyr::select(period, baseline_pd_mean3) %>% group_by(period) %>% 
  summarize(mean = mean(baseline_pd_mean3)) %>% arrange(-period)
###---###---###---###---###---###---###---###---###---###---
# 10. Analyses ####
###---###---###---###---###---###---###---###---###---###---

# 10.1 Approximate L&R approach  ----
# L&R ran linear models using lm with no specified NA behavior (default is to drop rows) and then did cluster-robust standard errors. Most of their models are not applicable to us, in part because half of them are country-level observations.

# OLS + country-specific time trends
# t4_5a <- "ln.trials ~ battles + factor(country) + factor(decade)"
# t4_5a <- "ln.trials ~ battles_sum10 + factor(gadm.adm0) + factor(decade)"
# 
# # Run each model
# results_t4_5a <- lm(formula = t4_5a, data = panel_dataset_site)
# 
# # Use cluster-robust standard errors 
# p_load(sandwich)
# p_load(broom)
# clrse_t4_5a <- results_t4_5a  %>% 
#   vcovHC(type = "HC1", cluster = "group", adjust = TRUE) %>% 
#   diag() %>% 
#   sqrt()
# 
# stargazer(results_t4_5a, 
#           se                = list(clrse_t4_5a),
#           omit              = c("factor", "time.id", "Constant"),
#           header            = FALSE,
#           model.names       = FALSE,
#           dep.var.caption   = "Panel A: Ln persons tried",
#           table.layout      = "=!l#-t-as-!", # Custom table formatting
#           float             = FALSE,
#           omit.stat         = c("adj.rsq", "ser", "f"),
#           type              = "text")

###---###---###---###---###---###---###---###---###---###---
# 11. Data shaping for analyses ####
###---###---###---###---###---###---###---###---###---###---

# because this panel is balanced, there are a lot of NA for ln.trials (0 trials), and limiting to complete cases then removes all the 0s...
model_vars <- c("time", "firsttrial", "tried", "triedbin",
                "MM_sum10", "texts_sum10", "texts_not_sum10", 
                "diffMM_10", "difftexts_10", "difftexts_not_10", 
                "tried_sum10","difftried_10", "battles_sum10", "diffbattle_10", 
               "degree", "between", "eigen",
               "hanse_dollinger", "hanse_pagel", "freeimp_jacob", #gov capacity
               "dio_cheney", "archdio_cheney", "cath_cheney", # rel capacity
               "prc_lag1", "temp_lag1", "prc_adj_lag", "temp_adj_lag", # climate
               "prc_shock", "prc_shock_hi", "prc_shock_lo",
               "temp_shock", "temp_shock_hi", "temp_shock_lo",
               # grouping/control vars
               "decade", "year", 
               "bhpr_runningnr", "grid",
               "grid_0", "grid_1", "grid_2", "grid_3", 
               "grid_4", "grid_5", "grid_6", "grid_7")

surv_sample <- surv_dataset_net[names(surv_dataset_net) %in% model_vars]
surv_sample <- surv_sample[complete.cases(surv_sample),]
surv_sample_ext <- filter(surv_sample, year>=1400&year<1701) # USTC extended
surv_sample <- filter(surv_sample, year>=1400&year<1651)

# transform continuous variables to work better in regression? the outcome var is continuous and left-skewed (for the net, very zero-inflated)
# to centering them on mean=0 and standardizing them so that 1 unit is 1 std dev
scale_vars <- c("MM_sum10", "texts_sum10", "texts_not_sum10", 
                "diffMM_10", "difftexts_10", "difftexts_not_10", 
                "tried_sum10","difftried_10", "battles_sum10", "diffbattle_10",
                "prc_lag1", "temp_lag1", "prc_adj_lag", "temp_adj_lag") 
# including the climate vars already adjusted to local values means something like normalizing the amount of variation that happens at a local level
surv_sample_scaled <- surv_sample %>%
  mutate(across(all_of(scale_vars), ~scale(.) %>% as.vector))


###---###---###---###---###---###---###---###---###---###---
# 12. Make Analysis RMD ####
###---###---###---###---###---###---###---###---###---###---

rmarkdown::render(here::here("Witchtrial_diffusion", "scripts", "Witches_Analysis.Rmd"))
browseURL(paste("file://", file.path(getwd(), "Witchtrial_diffusion", "scripts", "Witches_Analysis.html"), sep= ""))

